
# Replication of analysis in: "Simulating pluralism: the language of democracy in hegemonic authoritarianism" Open Access in Political Research Exchange (PRX)
# https://doi.org/10.1080/2474736X.2019.1605834


##########################################################################
##### Dictionary-based logit scaling, autocracies and democracies ########
##########################################################################


# Start from an empty workspace so objects left over from an earlier session
# cannot leak into the replication.
rm(list = ls())

# Set working directory:
# A bare setwd() call has no default directory and stops with an error as soon
# as the script is sourced. Uncomment the line below and insert the folder that
# contains "audem/" and "audemdic_valid.ykd".
# setwd("path/to/replication/folder")


# load packages
library(quanteda)
library(tm)
library(stringr)
library(XML)
library(RCurl)
library(SnowballC)
library(xml2)
library(ellipse)
library(extrafont)
library(plotrix)
library(ggplot2)
# extrafont: presumably registers the fonts from the extrafont font table with
# R's graphics devices so they can be used in plots -- TODO confirm the fonts
# were imported beforehand
loadfonts()

# list the font families that are available (printed for inspection only)
fonts() 

# PLEASE NOTE: SEE PAPER FOR PRECISE VERSIONS OF PACKAGES/R TO FULLY REPLICATE


# load data
# all scraped and "cleaned" speeches of each case as one .txt file
# autocracies and democracies, called audem.txt

# PLEASE NOTE: THE SPEECH DATA IS AVAILABLE UPON REQUEST (WRITE ME AN EMAIL)

# Read every speech file in "audem" exactly once. Each file holds all speeches
# of one case and becomes one document, i.e. one element of `speeches`.
fileDir <- "audem"
# The pattern is anchored so that only names ending in ".txt" are matched;
# list.files() returns the paths sorted alphabetically by file name.
speech_files <- list.files(fileDir, pattern = "\\.txt$", full.names = TRUE)
if (length(speech_files) == 0) {
  stop("no .txt files found in '", fileDir, "'", call. = FALSE)
}
# Collapse the lines of each file into a single string so the result is always
# a plain character vector (one entry per file), whatever the number of lines.
speeches <- unname(vapply(
  speech_files,
  function(path) paste(readLines(path), collapse = "\n"),
  character(1)
))


# name each file as per case (important: has to be alphabetically ordered!)
# The labels are assigned by position, so their order has to match the order in
# which the files were read above (sorted by file name). Note that this is not
# the alphabetical order of the country labels themselves.
names(speeches) <- c("Azerbaijan", "Canada", "Cameroon", "Denmark", "Germany", "Hungary", "Jordan", "Kuwait", "Kazakhstan", 
                    "Malaysia", "Morocco", "Norway", "North Korea", "Russia", "Saudi Arabia", "Singapore", 
                     "Tajikistan", "Turkey", "United Arab Emirates", "Uganda", "Great Britain", "Uzbekistan")


# create quanteda corpus
# one document per element of `speeches`; presumably the names set on
# `speeches` become the document names -- they are read back later via
# row.names() of the dfm
dic_speech <- corpus(speeches, 
                     metacorpus = list(notes = "Leaders' speeches"))

# overview of the documents in the corpus (printed for inspection only)
summary(dic_speech)

# turn into dfm
# document-feature matrix: one row per document, one column per feature
mydfm <- dfm(dic_speech)
mydfm


# load dictionary
#mydictio <- dictionary(file = "audemdic_valid.ykd", format = "yoshikoder", concatenator = "_",
#                       tolower = TRUE, encoding = "auto")
#mydictio

# for now, use function proposed by Benoit (current quanteda version has a bug concerning the function above)
# see here: https://stackoverflow.com/questions/43819992/quanteda-applying-yoshikoder-dictionary-with-multiple-levels
#' Read a Yoshikoder dictionary file into a quanteda dictionary
#'
#' Collects every `pnode` element of the XML file and groups the `name`
#' attributes of these nodes by a key built from the `name` attributes of
#' their ancestor nodes, joined with `sep`.
#'
#' @param path File to read; handed to `xml2::read_xml()`.
#' @param sep String placed between the ancestor names when building a key.
#' @return The result of `dictionary()` applied to the named list of patterns
#'   (one list element per key).
read_dict_yoshikoder <- function(path, sep=">"){
  ykd <- xml2::read_xml(path)
  pattern_nodes <- xml2::xml_find_all(ykd, ".//pnode")

  # Key of one pattern node: the names of its ancestors in reverse of the
  # order xml_parents() returns them (presumably outermost first); ancestors
  # without a name attribute are left out.
  ancestor_key <- function(node) {
    ancestor_names <- xml2::xml_attr(xml2::xml_parents(node), "name")
    named <- ancestor_names[!is.na(ancestor_names)]
    paste0(rev(named), collapse = sep)
  }

  keys <- vapply(pattern_nodes, ancestor_key, character(1))
  patterns_by_key <- split(xml2::xml_attr(pattern_nodes, "name"), keys)
  dictionary(patterns_by_key)
}

# read the dictionary file with the helper defined above
mydictio <- read_dict_yoshikoder("audemdic_valid.ykd")


# make categories to character list to count terms
# unlist() flattens the dictionary into one vector of entries; it is printed
# for inspection
wordcount <- unlist(mydictio)
wordcount


# apply dictionary
# the resulting dfm has one column per dictionary key instead of one per word
# (the script later relies on there being eight such columns)
dicdfm <- dfm(mydfm, dictionary = mydictio)
dicdfm
# implement statistical model as explained in the paper for each of the two subcategories

# but first, transform into data frame (easier to extract info)
# The dimensions are taken from the dfm itself rather than typed in, so a
# changed number of speech files cannot silently misalign counts and cases.
# The category sums below address the columns by position (V1..V8), so the
# dictionary has to yield exactly eight columns.
stopifnot(ncol(dicdfm) == 8)
model <- matrix(dicdfm, nrow = nrow(dicdfm), ncol = ncol(dicdfm))
model <- as.data.frame(model)
# make the document names of the dfm the row names
row.names(model) <- row.names(dicdfm)
model

# add up categories (two dictionary columns per category)
# NOTE(review): which dictionary key each of V1..V8 holds depends on the column
# order of `dicdfm` -- confirm against colnames(dicdfm).
model$autoproc <- model$V1 + model$V2
model$autoideo <- model$V3 + model$V4
model$demoideo <- model$V5 + model$V6
model$demoproc <- model$V7 + model$V8

# erase other columns (the eight raw count columns)
model[, 1:8] <- NULL
model


# apply model to estimate scale positions on first dimension: conservatism vs. liberalism
# (autoideo vs. demoideo)

# apply model, test in one case first (the last row of `model`):
# q is the estimate of the position on the liberalism-conservatism scale:
# the log of the ratio demoideo / autoideo, with 0.5 added to each count so
# the ratio stays finite and positive when a count is zero
n <- nrow(model)
q <- log((model$demoideo[n] + 0.5) / (model$autoideo[n] + 0.5))
q
# sigi are sigma values for the first dimension (i = ideological orientation):
# the square root of the summed reciprocals of the two smoothed counts
sigi <- sqrt((model$demoideo[n] + 0.5)^-1 + (model$autoideo[n] + 0.5)^-1)
sigi

# now for all cases at once: arithmetic on the columns is elementwise, so no
# loop over the rows is needed
q <- log((model$demoideo + 0.5) / (model$autoideo + 0.5))
sigi <- sqrt((model$demoideo + 0.5)^-1 + (model$autoideo + 0.5)^-1)
q

# add scale for first dimension and sigma values as columns to model
model$scaleideo <- q
model$sigideo <- sigi
model



# apply model to estimate scale positions on second dimension: Autocratic vs. Democratic Procedures
# (autoproc vs. demoproc)

# apply model, test in one case first (the last row of `model`):
# q is the estimate of the position on the democratic procedures-autocratic
# procedures scale: the log of the ratio demoproc / autoproc, with 0.5 added to
# each count so the ratio stays finite and positive when a count is zero
n <- nrow(model)
q <- log((model$demoproc[n] + 0.5) / (model$autoproc[n] + 0.5))
q
# sigp are sigma values for the second dimension (p = procedures):
# the square root of the summed reciprocals of the two smoothed counts
sigp <- sqrt((model$demoproc[n] + 0.5)^-1 + (model$autoproc[n] + 0.5)^-1)
sigp

# now for all cases at once: arithmetic on the columns is elementwise, so no
# loop over the rows is needed
q <- log((model$demoproc + 0.5) / (model$autoproc + 0.5))
sigp <- sqrt((model$demoproc + 0.5)^-1 + (model$autoproc + 0.5)^-1)
q

# add scale for second dimension and sigma values as columns to model
model$scaleproc <- q
model$sigproc <- sigp
model


# confidence intervals for both dimensions, using the interval formula
# suggested by Lowe (2011): [q - 1.96 x sigma, q + 1.96 x sigma]
# x is the ideological dimension (ideo), y the procedural dimension (proc)
model$xneg <- with(model, scaleideo - 1.96 * sigideo)
model$xpos <- with(model, scaleideo + 1.96 * sigideo)

model$yneg <- with(model, scaleproc - 1.96 * sigproc)
model$ypos <- with(model, scaleproc + 1.96 * sigproc)
model

# keep the case label (the row name) as an ordinary column as well
model$group <- row.names(model)
model


# the ellipses that illustrate the confidence intervals need one extent per
# dimension: the distance between the point estimate and the upper interval
# bound, computed as estimate minus bound (so the values are not positive)
t <- with(model, scaleideo - xpos)
t
u <- with(model, scaleproc - ypos)
u


# plot scale and save it in folder:
# NOTE(review): "C/..Scale.png" looks like a placeholder path -- confirm the
# intended output location before running.
png(filename = "C/..Scale.png",
    width = 1400, height = 1000)
x <- model$scaleideo
y <- model$scaleproc

# one plotting symbol per case, in the row order of `model`; the legend below
# maps the symbols to regime types (15 closed, 8 hegemonic, 17 competitive,
# 2 backsliding, 0 democratic)
pch.list <- c(8,0,8,0,0,2,8,15,8,17,17,0,15,17,15,8,8,2,15,17,0,8)

# widen the left margin so the large y-axis label fits
mar.default <- c(4,4,4,4) + 0.1
par(mar = mar.default + c(0, 4, 0, 0))
plot(x, y,
     ylab = expression("Autocratic Procedures    " %<->% "    Democratic Procedures"),
     xlab = expression("Illiberalism    " %<->% "    Liberalism"),
     cex.lab = 1.9,
     cex.axis = 1.5,
     xlim = c(-3.5, 1.4),
     ylim = c(-1, 1.1),
     pch = pch.list,
     cex = 1.9,
     # dashed zero lines; panel.first is evaluated once the coordinate system
     # exists, instead of relying on an unnamed argument that happens to be
     # matched to `type`
     panel.first = abline(h = 0, v = 0, lty = 2))
text(x, y, labels = row.names(model), cex = 1.6, pos = 3)
# `t` and `u` (estimate minus upper interval bound) are passed on unchanged as
# the two ellipse extents
draw.ellipse(x, y, a = t, b = u,
             col = rgb(0, 0, 0, alpha = 0.1), border = rgb(0, 0, 0, alpha = 0.1))
# five regime symbols in near-opaque black plus a faint square that stands for
# the shaded region
legend(x = "bottomright", bty = "n", pch = c(15, 8, 17, 2, 0, 15),
       legend = c("closed", "hegemonic", "competitive",
                  "backsliding", "democratic", "95% Region"),
       cex = 1.9,
       col = c(rep(rgb(0, 0, 0, alpha = 0.9), 5), rgb(0, 0, 0, alpha = 0.15)))
dev.off()


# end of script!


